{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import re\n",
    "import collections"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#一元n-gram 创建字典\n",
    "dic = []\n",
    "with open('input.txt','r') as f:\n",
    "    for line in f.readlines():\n",
    "        word2 = re.findall('[a-zA-Z\\']+', line)\n",
    "        dic.extend([word.lower() for word in word2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12631"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set(dic))#12631个单词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#2元模型 n-gram 创建2个单词的组合\n",
    "dic_2 = []\n",
    "for i in range(len(dic)-1):\n",
    "    two_words = dic[i]+' '+dic[i+1]\n",
    "    dic_2.append(two_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#3元 n-gram 创建3个单词的组合\n",
    "dic_3 = []\n",
    "for i in range(len(dic)-2):\n",
    "    three_words = dic[i]+' '+dic[i+1]+' '+dic[i+2]\n",
    "    dic_3.append(three_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6.31277399919021e-11\n"
     ]
    }
   ],
   "source": [
    "#条件概率\n",
    "dic_num = collections.Counter(dic)\n",
    "dic_num_2 = collections.Counter(dic_2)\n",
    "dic_num_3 = collections.Counter(dic_3)\n",
    "#it's a 的概率\n",
    "#p(are|we) = p(we are)/p(we)\n",
    "print((dic_num_2['we are']/len(dic_num_2))/dic_num['we']/len(dic_num))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sentence = ['i','will','tell','you']\n",
    "\n",
    "#创建一个共现矩阵\n",
    "def create_matrix(sentence):\n",
    "    matrix = np.zeros([len(sentence),len(sentence)])\n",
    "    for i in range(len(sentence)):\n",
    "        for j in range(len(sentence)):\n",
    "            matrix[i][j] = dic_num_2[sentence[i]+' '+sentence[j]]\n",
    "    return matrix\n",
    "\n",
    "#创建最大似然（MLP）的计数概率矩阵\n",
    "def score_matrix(sentence,co_matrix):\n",
    "    matrix = np.zeros([len(sentence),len(sentence)])\n",
    "    for i in range(len(co_matrix[0])):\n",
    "        for j in range(len(co_matrix[1])):\n",
    "            matrix[i][j] = co_matrix[i][j]/dic_num[sentence[i]]\n",
    "    return matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "co_matrix = create_matrix(sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "MLP_matrix = score_matrix(sentence,co_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Add-one 平滑\n",
    "#先对co_matrix所有的数值加一\n",
    "def add_one_op(co_matrix):\n",
    "    for i in range(len(co_matrix[0])):\n",
    "        for j in range(len(co_matrix[1])):\n",
    "            co_matrix[i][j] = co_matrix[i][j]+1\n",
    "    return co_matrix\n",
    "\n",
    "def new_score_matrix(sentence,co_matrix):\n",
    "    matrix = np.zeros([len(sentence),len(sentence)])\n",
    "    for i in range(len(co_matrix[0])):\n",
    "        for j in range(len(co_matrix[1])):\n",
    "            matrix[i][j] = co_matrix[i][j]/(dic_num[sentence[i]]+12631)#V = 12631\n",
    "    return matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[2.90816030e-04, 1.64601873e-02, 1.45408015e-03, 5.81632060e-05],\n",
       "       [4.16849495e-03, 7.31314904e-05, 6.58183414e-04, 3.36404856e-03],\n",
       "       [2.32684402e-04, 7.75614675e-05, 7.75614675e-05, 2.63708989e-03],\n",
       "       [1.52100894e-03, 4.11939920e-03, 8.87255213e-04, 8.23879840e-04]])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_score_matrix(sentence,add_one_op(co_matrix))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
